{% extends "base.html" %}
{% from "components/custom_dropdown.html" import render_dropdown %}

{% set active_page = 'benchmark' %}

{% block title %}Benchmark Configuration - Deep Research System{% endblock %}

{% block extra_head %}
<meta name="csrf-token" content="{{ csrf_token() }}">
<link rel="stylesheet" href="{{ url_for('research.serve_static', path='css/custom_dropdown.css') }}">
<!-- NOTE(review): this URL has no version pin and no integrity/crossorigin attributes, so the CDN decides which Chart.js release is served. Consider pinning a version and adding SRI. -->
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<style>
.benchmark-card {
    width: 100%;
    margin: 0;
}

.dataset-config {
    border: 1px solid var(--border-color);
    border-radius: 8px;
    padding: 20px;
    margin-bottom: 20px;
    background: var(--card-bg);
}

/* Superseded: `.dataset-header` is declared again further down in this stylesheet
   (under "Improved dataset configuration cards") with the same four properties, so
   that later rule wins the cascade and `align-items` resolves to flex-start. */
.dataset-header {
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-bottom: 15px;
}

/* Superseded in the same way: the later `.dataset-toggle` rule redeclares every
   property listed here, so the effective gap is 8px rather than 10px. */
.dataset-toggle {
    display: flex;
    align-items: center;
    gap: 10px;
}

.benchmark-progress {
    margin-top: 20px;
    padding: 20px;
    background: var(--card-bg);
    border-radius: 8px;
    border: 1px solid var(--border-color);
    display: none;
}

.progress-header {
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-bottom: 15px;
}

.progress-bar {
    width: 100%;
    height: 20px;
    background: var(--bg-color);
    border-radius: 10px;
    overflow: hidden;
    margin-bottom: 15px;
}

.progress-fill {
    height: 100%;
    background: linear-gradient(90deg, var(--primary-color), var(--accent-color));
    width: 0%;
    transition: width 0.3s ease;
}

.metrics-grid {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
    gap: 15px;
    margin-top: 15px;
}

.metric-card {
    padding: 15px;
    background: var(--bg-color);
    border-radius: 6px;
    text-align: center;
}

.metric-value {
    font-size: 1.5rem;
    font-weight: bold;
    color: var(--primary-color);
}

.metric-label {
    font-size: 0.9rem;
    color: var(--text-muted);
    margin-top: 5px;
}

.dataset-accuracy {
    display: flex;
    justify-content: space-between;
    margin-top: 10px;
    padding: 10px;
    background: var(--bg-color);
    border-radius: 6px;
}

.alert {
    padding: 15px;
    border-radius: 6px;
    margin-bottom: 15px;
}

.alert-warning {
    background-color: #fff3cd;
    border: 1px solid #ffeaa7;
    color: #856404;
}

.alert i {
    margin-right: 8px;
}

/* Question and Results Display Styles */
.benchmark-section {
    margin-top: 20px;
}

.question-card {
    background: #1a1a1a;
    border: 1px solid #333;
    border-radius: 8px;
    padding: 15px;
    margin-bottom: 10px;
}

.question-content {
    margin-bottom: 10px;
}

.question-text {
    font-size: 1rem;
    line-height: 1.4;
    color: #e0e0e0;
    margin-bottom: 8px;
    padding: 10px;
    background: #2a2a2a;
    border-radius: 4px;
    border-left: 4px solid var(--primary-color);
}

.question-meta {
    display: flex;
    gap: 10px;
    font-size: 0.85rem;
    color: var(--text-muted);
}

.dataset-badge {
    background: var(--primary-color);
    color: white;
    padding: 2px 8px;
    border-radius: 12px;
    font-size: 0.8rem;
    font-weight: 500;
}

.search-count-badge {
    color: white;
    padding: 2px 6px;
    border-radius: 10px;
    font-size: 0.75rem;
    font-weight: 500;
    margin-left: 8px;
}

.search-count-badge.critical {
    background: #f44336; /* Red for 0-1 results */
}

.search-count-badge.warning {
    background: #ff9800; /* Orange for 2-4 results */
}

.search-count-badge.good {
    background: #4caf50; /* Green for 5+ results */
}

.processing-status {
    padding: 8px 12px;
    background: var(--bg-secondary);
    border-radius: 4px;
    font-size: 0.9rem;
    color: var(--text-muted);
}

.processing-status.processing {
    background: #ff9800;
    color: #ffffff;
}

.processing-status.completed {
    background: #e8f5e8;
    color: #2e7d32;
}

.result-card {
    background: #1a1a1a;
    border: 1px solid #333;
    border-radius: 6px;
    padding: 12px;
    margin-bottom: 8px;
    transition: border-color 0.2s;
}

.result-card.correct {
    border-left: 4px solid #4caf50;
}

.result-card.incorrect {
    border-left: 4px solid #f44336;
}

/* Flex row at the top of a result card; children are pushed to opposite edges.
   The value was previously `between`, which is not a valid justify-content
   keyword, so the declaration was discarded and items packed at the start. */
.result-header {
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-bottom: 8px;
    font-size: 0.85rem;
    color: #a0a0a0;
}

.result-status {
    font-weight: 600;
}

.result-status.correct {
    color: #4caf50;
}

.result-status.incorrect {
    color: #f44336;
}

.answer-comparison {
    display: grid;
    gap: 8px;
}

.answer-box {
    padding: 12px;
    border-radius: 4px;
    font-size: 0.95rem;
    line-height: 1.5;
    white-space: pre-wrap;
    word-break: break-word;
    min-height: 60px;
    color: #e0e0e0 !important;
}

.answer-box > div {
    margin-top: 5px;
    color: #e0e0e0 !important;
}

.model-answer {
    background: #1e2a3a;
    border-left: 4px solid #2196f3;
    color: #e0e0e0 !important;
}

.correct-answer {
    background: #1e3a1e;
    border-left: 4px solid #4caf50;
    color: #e0e0e0 !important;
}

.answer-label {
    font-size: 0.75rem;
    font-weight: 600;
    color: #a0a0a0;
    margin-bottom: 4px;
    text-transform: uppercase;
    letter-spacing: 0.5px;
}

.no-results {
    text-align: center;
    color: var(--text-muted);
    padding: 20px;
    font-style: italic;
}

#recent-results-container {
    max-height: 600px;
    overflow-y: auto;
}

/* Improved layout structure */
.page-content {
    width: 100%;
    max-width: none;
}

.benchmark-progress .card {
    width: 100%;
    max-width: none;
}

.form-group {
    width: 100%;
}

/* Better visual hierarchy */
.benchmark-guidelines {
    background: linear-gradient(135deg, #1e1e1e 0%, #2a2a2a 100%);
    border: 1px solid #404040;
    border-left: 4px solid var(--primary-color);
    border-radius: 8px;
    margin-bottom: 25px;
    box-shadow: 0 2px 8px rgba(0,0,0,0.3);
}

.guidelines-content {
    display: grid;
    grid-template-columns: 1fr auto;
    gap: 25px;
    align-items: start;
}

.guidelines-text {
    padding: 25px;
}

.guidelines-sidebar {
    min-width: 200px;
    background: rgba(var(--primary-color-rgb), 0.1);
    padding: 20px;
    border-radius: 0 8px 8px 0;
    text-align: center;
    border-left: 1px solid rgba(var(--primary-color-rgb), 0.2);
}

/* Enhanced form sections */
.form-section {
    background: #1a1a1a;
    border: 1px solid #333;
    border-radius: 8px;
    margin-bottom: 20px;
    overflow: hidden;
}

.form-section-header {
    background: linear-gradient(90deg, #2a2a2a 0%, #333 100%);
    padding: 15px 20px;
    border-bottom: 1px solid #404040;
}

.form-section-title {
    color: var(--primary-color);
    font-size: 1.1rem;
    font-weight: 600;
    margin: 0;
    display: flex;
    align-items: center;
    gap: 8px;
}

.form-section-content {
    padding: 20px;
}

/* Improved dataset configuration cards */
.dataset-grid {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
    gap: 20px;
}

.dataset-card {
    background: #1e1e1e;
    border: 2px solid #333;
    border-radius: 10px;
    padding: 20px;
    transition: all 0.3s ease;
    position: relative;
}

.dataset-card:hover {
    border-color: var(--primary-color);
    box-shadow: 0 4px 12px rgba(var(--primary-color-rgb), 0.2);
}

.dataset-card.disabled {
    opacity: 0.6;
    border-color: #555;
}

.dataset-header {
    display: flex;
    justify-content: space-between;
    align-items: flex-start;
    margin-bottom: 15px;
}

.dataset-info h3 {
    color: #e0e0e0;
    margin: 0 0 5px 0;
    font-size: 1.2rem;
}

.dataset-info p {
    color: #a0a0a0;
    margin: 0;
    font-size: 0.9rem;
}

.dataset-toggle {
    display: flex;
    align-items: center;
    gap: 8px;
}

/* Modern toggle switch */
.toggle-switch {
    position: relative;
    width: 50px;
    height: 24px;
    background: #555;
    border-radius: 12px;
    cursor: pointer;
    transition: background 0.3s;
}

.toggle-switch.active {
    background: var(--primary-color);
}

.toggle-switch::after {
    content: '';
    position: absolute;
    top: 2px;
    left: 2px;
    width: 20px;
    height: 20px;
    background: white;
    border-radius: 50%;
    transition: transform 0.3s;
}

.toggle-switch.active::after {
    transform: translateX(26px);
}

/* Enhanced input styling */
.form-control {
    background: #2a2a2a;
    border: 2px solid #404040;
    border-radius: 6px;
    padding: 10px 12px;
    color: #e0e0e0;
    font-size: 0.95rem;
    transition: border-color 0.3s, box-shadow 0.3s;
}

.form-control:focus {
    border-color: var(--primary-color);
    box-shadow: 0 0 0 3px rgba(var(--primary-color-rgb), 0.2);
    outline: none;
}

/* Responsive improvements */
@media (max-width: 1200px) {
    .guidelines-content {
        grid-template-columns: 1fr;
    }

    .guidelines-sidebar {
        border-radius: 0 0 8px 8px;
        border-left: none;
        border-top: 1px solid rgba(var(--primary-color-rgb), 0.2);
    }
}

@media (max-width: 768px) {
    .dataset-grid {
        grid-template-columns: 1fr;
    }

    .guidelines-text {
        padding: 20px;
    }

    .form-section-content {
        padding: 15px;
    }
}


/* Performance Charts Styles */
.charts-section {
    margin-top: 20px;
}

.charts-grid {
    display: grid;
    grid-template-columns: 1fr 1fr;
    gap: 20px;
    margin-top: 15px;
}

.chart-container {
    background: #1a1a1a;
    border: 1px solid #333;
    border-radius: 8px;
    padding: 15px;
    height: 300px;
}

.chart-title {
    color: #e0e0e0;
    font-size: 1rem;
    font-weight: 600;
    margin-bottom: 10px;
    text-align: center;
}

.chart-canvas {
    width: 100% !important;
    height: 250px !important;
}

@media (max-width: 768px) {
    .charts-grid {
        grid-template-columns: 1fr;
    }
}

/* Evaluation Settings Styles */
.form-row {
    display: grid;
    grid-template-columns: 1fr 1fr;
    gap: 20px;
    margin-bottom: 15px;
}

.form-group.half {
    margin-bottom: 0;
}

@media (max-width: 768px) {
    .form-row {
        grid-template-columns: 1fr;
        gap: 15px;
    }
}
</style>
{% endblock %}

{% block content %}
<div class="page active" id="benchmark">
    <div class="page-header">
        <h1>Benchmark Configuration</h1>
        <p class="page-subtitle">Test and optimize your search configurations</p>
        <div style="margin-top: 10px;">
            <a href="{{ url_for('benchmark.results') }}" class="btn btn-secondary">
                <i class="fas fa-chart-line"></i> View Past Results
            </a>
        </div>
    </div>

    <!-- Benchmark Usage Guidelines -->
    <div class="benchmark-guidelines">
        <div class="guidelines-content">
            <div class="guidelines-text">
                <h3 style="color: var(--primary-color); margin-bottom: 15px; font-size: 1.3rem;">
                    <i class="fas fa-info-circle"></i> Benchmark Guidelines
                </h3>
                <p style="margin-bottom: 12px; line-height: 1.6; color: #e0e0e0;">
                    <strong>Purpose:</strong> Benchmarks are designed to help you evaluate if your configuration works well, not for research papers or production use.
                </p>
                <p style="margin-bottom: 12px; line-height: 1.6; color: #e0e0e0;">
                    <strong>Responsible Usage:</strong> Please use reasonable example counts to avoid overwhelming search engines. The default of 50 examples provides a good balance for configuration testing.
                </p>
                <p style="margin-bottom: 18px; line-height: 1.6; color: #e0e0e0;">
                    <strong>Requirements:</strong> Benchmarks require an evaluation model for grading results. You can configure your preferred provider and model in the Evaluation Settings below. The default uses OpenRouter with Claude 3.7 Sonnet, but you can choose from various providers including OpenAI, Anthropic, or local models.
                </p>
                <div style="background: rgba(255, 167, 38, 0.1); border: 1px solid rgba(255, 167, 38, 0.3); padding: 15px; border-radius: 8px; margin-top: 15px;">
                    <h4 style="color: #ffa726; margin-bottom: 10px; font-size: 1rem; display: flex; align-items: center; gap: 8px;">
                        <i class="fas fa-search"></i> Search Engine Recommendations
                    </h4>
                    <ul style="margin: 0; padding-left: 20px; font-size: 0.95rem; line-height: 1.5; color: #e0e0e0;">
                        <li style="margin-bottom: 8px;"><strong style="color: #4caf50;">Tavily:</strong> Recommended for general knowledge benchmarks - AI-optimized search API, reliable results</li>
                        <li style="margin-bottom: 8px;"><strong style="color: #2196f3;">Brave:</strong> Independent search engine but unknown why performance is lower - could be smaller index, different ranking algorithm, or API limitations</li>
                        <li style="margin-bottom: 8px;"><strong style="color: #ff9800;">SearXNG:</strong> Often outperforms commercial APIs by aggregating multiple sources - shared resource, use moderate example counts</li>
                        <li style="margin-bottom: 8px;"><strong style="color: #f44336;">Specialized engines (ArXiv, PubMed, Wikipedia):</strong> Shared resources that are useless for general SimpleQA questions - should not be used for this test</li>
                    </ul>
                    <div style="background: rgba(33, 150, 243, 0.1); border: 1px solid rgba(33, 150, 243, 0.3); padding: 12px; border-radius: 6px; margin-top: 12px;">
                        <p style="margin: 0; font-size: 0.9rem; color: #e0e0e0;">
                            <strong style="color: #2196f3;">🔧 For Shared Resources:</strong> When using SearXNG or other shared engines, reduce iterations and questions per iteration in Settings to minimize load on shared infrastructure.
                        </p>
                    </div>
                </div>
            </div>
            <div class="guidelines-sidebar">
                <div style="font-size: 2.5rem; color: var(--primary-color); margin-bottom: 12px;">
                    <i class="fas fa-tachometer-alt"></i>
                </div>
                <div style="font-size: 1.2rem; font-weight: 600; color: #e0e0e0; margin-bottom: 8px;">
                    Quick Check
                </div>
                <div style="font-size: 0.9rem; color: #a0a0a0; line-height: 1.4; margin-bottom: 15px;">
                    Test your config with reasonable limits
                </div>
                <div style="background: rgba(var(--primary-color-rgb), 0.2); padding: 8px 12px; border-radius: 6px; font-size: 0.85rem; color: var(--primary-color); font-weight: 500;">
                    🎯 Configuration Testing
                </div>
            </div>
        </div>
    </div>

    <!-- Alert container -->
    <div id="benchmark-alert" class="settings-alert-container" style="display:none"></div>

    <div class="card benchmark-card">
        <div class="card-content">
            <form id="benchmark-form">

                <!-- Benchmark Name -->
                <div class="form-group">
                    <label for="run_name">Benchmark Name (Optional)</label>
                    <input type="text" id="run_name" name="run_name" class="form-control" placeholder="e.g., 'Test new search strategy'">
                    <span class="input-help">Give your benchmark run a descriptive name</span>
                </div>

                <!-- Dataset Configuration -->
                <div class="form-group">
                    <fieldset>
                        <legend>Dataset Selection</legend>

                        <!-- SimpleQA Dataset -->
                        <div class="dataset-config">
                            <div class="dataset-header">
                                <div>
                                    <h3>SimpleQA</h3>
                                    <p>Fact-based questions with clear answers</p>
                                </div>
                                <div class="dataset-toggle">
                                    <input type="checkbox" id="simpleqa_enabled" checked>
                                    <label for="simpleqa_enabled">Enable</label>
                                </div>
                            </div>
                            <div class="form-group">
                                <label for="simpleqa_count">Number of Examples</label>
                                <input type="number" id="simpleqa_count" name="simpleqa_count" value="50" min="1" max="500" class="form-control">
                                <span class="input-help">Recommended: 50 examples provides good balance for configuration testing</span>
                            </div>
                        </div>

                        <!-- BrowseComp Dataset -->
                        <div class="dataset-config" style="border: 2px solid #f44336; background: #2a1e1e;">
                            <div class="dataset-header">
                                <div>
                                    <h3 style="color: #f44336;">BrowseComp</h3>
                                    <p style="color: #ccc;">Complex browsing and comparison tasks</p>
                                    <div style="background: #3a1e1e; border: 1px solid #f44336; color: #f44336; padding: 10px 12px; border-radius: 4px; margin-top: 10px; font-size: 0.85rem; line-height: 1.4;">
                                        <i class="fas fa-exclamation-triangle"></i> <strong>Poor Performance Warning:</strong> We currently achieve close to 0% accuracy on BrowseComp.
                                        <br><strong>For testing only:</strong> Limited to 20 examples max to see what this benchmark is about.
                                    </div>
                                </div>
                                <div class="dataset-toggle">
                                    <input type="checkbox" id="browsecomp_enabled">
                                    <label for="browsecomp_enabled">Enable (Testing Only)</label>
                                </div>
                            </div>
                            <div class="form-group">
                                <label for="browsecomp_count">Number of Examples (Max 20)</label>
                                <input type="number" id="browsecomp_count" name="browsecomp_count" value="0" min="0" max="20" class="form-control" disabled>
                                <span class="input-help" style="color: #f44336;">Restricted to max 20 examples due to poor performance - for curiosity testing only</span>
                            </div>
                        </div>
                    </fieldset>
                </div>

                <!-- Current Database Settings -->
                <div class="form-group">
                    <fieldset>
                        <legend>Current Configuration</legend>
                        <div class="dataset-config" style="background: var(--bg-color); border: 1px solid var(--border-color);">
                            <div class="dataset-header">
                                <div>
                                    <h3>Active Database Settings</h3>
                                    <p>Benchmark will use all settings from your database configuration</p>
                                </div>
                            </div>

                            <div id="current-settings-display" style="display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 15px; margin-bottom: 15px;">
                                <div class="metric-card" style="text-align: left; padding: 10px;">
                                    <div class="metric-label">Provider</div>
                                    <div class="metric-value" style="font-size: 0.9rem;" id="current-provider">Loading...</div>
                                </div>
                                <div class="metric-card" style="text-align: left; padding: 10px;">
                                    <div class="metric-label">Model</div>
                                    <div class="metric-value" style="font-size: 0.9rem;" id="current-model">Loading...</div>
                                </div>
                                <div class="metric-card" style="text-align: left; padding: 10px;">
                                    <div class="metric-label">Search Tool</div>
                                    <div class="metric-value" style="font-size: 0.9rem;" id="current-search-tool">Loading...</div>
                                </div>
                                <div class="metric-card" style="text-align: left; padding: 10px;">
                                    <div class="metric-label">Iterations</div>
                                    <div class="metric-value" style="font-size: 0.9rem;" id="current-iterations">Loading...</div>
                                </div>
                                <div class="metric-card" style="text-align: left; padding: 10px;">
                                    <div class="metric-label">Questions/Iter</div>
                                    <div class="metric-value" style="font-size: 0.9rem;" id="current-questions">Loading...</div>
                                </div>
                                <div class="metric-card" style="text-align: left; padding: 10px;">
                                    <div class="metric-label">Strategy</div>
                                    <div class="metric-value" style="font-size: 0.9rem;" id="current-strategy">Loading...</div>
                                </div>
                            </div>

                            <!-- Pointer to where the values shown above can be changed. The link opens in a new tab, so rel="noopener" keeps the opened page from reaching back through window.opener. -->
                            <div style="font-size: 0.9rem; color: var(--text-muted); text-align: center;">
                                <i class="fas fa-info-circle"></i> To change any settings, go to <a href="/research/" target="_blank" rel="noopener" style="color: var(--primary-color);">Settings Dashboard</a>
                            </div>
                        </div>
                    </fieldset>
                </div>

                <!-- Evaluation Model Settings -->
                <div class="form-group">
                    <fieldset>
                        <legend>Evaluation Model Settings</legend>
                        <div class="dataset-config" style="background: var(--bg-color); border: 1px solid var(--border-color);">
                            <!-- Heading for the grader (evaluation model) settings -->
                            <div class="dataset-header">
                                <div>
                                    <h3>Benchmark Evaluation Configuration</h3>
                                    <p>Configure the model used to grade benchmark results</p>
                                </div>
                            </div>

                            <!-- Row 1: provider select and model dropdown, side by side -->
                            <div class="form-row">
                                <!-- Provider: starts with a single placeholder option; the saved value travels in data-initial-value -->
                                <div class="form-group half">
                                    <label for="evaluation_provider">Evaluation Provider</label>
                                    <select id="evaluation_provider" name="evaluation_provider" class="form-control" data-initial-value="{{ eval_settings.evaluation_provider }}">
                                        <option value="">Loading providers...</option>
                                    </select>
                                    <span class="input-help">Provider for the evaluation model</span>
                                </div>

                                <!-- Model: rendered through the shared custom dropdown macro -->
                                <div class="form-group half">
                                    {{ render_dropdown(
                                        input_id="evaluation_model",
                                        dropdown_id="evaluation-model-dropdown",
                                        placeholder="Enter or select evaluation model",
                                        label="Evaluation Model",
                                        help_text="Model to grade benchmark results",
                                        allow_custom=true,
                                        show_refresh=true,
                                        refresh_aria_label="Refresh evaluation model list",
                                        data_initial_value=eval_settings.evaluation_model
                                    ) }}
                                </div>
                            </div>

                            <!-- Row 2: endpoint URL and sampling temperature -->
                            <div class="form-row">
                                <!-- Endpoint: free-text URL pre-filled from eval_settings -->
                                <div class="form-group half">
                                    <label for="evaluation_endpoint_url">Endpoint URL</label>
                                    <input type="text" id="evaluation_endpoint_url" name="evaluation_endpoint_url" class="form-control" placeholder="https://openrouter.ai/api/v1" value="{{ eval_settings.evaluation_endpoint_url }}">
                                    <span class="input-help">API endpoint for evaluation model</span>
                                </div>

                                <!-- Temperature: slider from 0 to 1 in steps of 0.1 -->
                                <div class="form-group half">
                                    <label for="evaluation_temperature">Temperature</label>
                                    <input type="range" id="evaluation_temperature" name="evaluation_temperature" class="form-control" min="0" max="1" step="0.1" value="{{ eval_settings.evaluation_temperature }}">
                                    <span class="input-help">0 recommended for consistent evaluation</span>
                                </div>
                            </div>

                            <!-- Guidance on choosing a grader model -->
                            <div class="alert" style="background: rgba(33, 150, 243, 0.1); border: 1px solid rgba(33, 150, 243, 0.3); color: #ffffff; padding: 15px; border-radius: 6px; margin-top: 15px;">
                                <i class="fas fa-info-circle" style="color: #2196f3; margin-right: 8px;"></i>
                                <strong style="color: #2196f3;">Evaluation Model Selection:</strong>
                                For accurate benchmark grading, use flagship models from major providers like Claude Sonnet series or GPT-4 class models.
                                Local models and smaller cloud models may produce inconsistent evaluations, affecting benchmark accuracy scores.
                                However, preliminary tests indicate that local models might be adequate for performance evaluation if highest grade standards are not required.
                            </div>
                        </div>
                    </fieldset>
                </div>

                <!-- Search Engine Warning -->
                <div class="form-group" id="search-engine-warning" style="display: none;">
                    <div class="alert" style="background: #2d1b1b; border: 1px solid #f44336; color: #ffffff; padding: 15px; border-radius: 6px;">
                        <i class="fas fa-exclamation-triangle" style="color: #f44336; margin-right: 8px;"></i>
                        <strong style="color: #f44336;">Search Engine Notice:</strong>
                        <span id="search-warning-text" style="color: #ffffff;"></span>
                    </div>
                </div>

                <!-- Configuration Summary -->
                <div class="form-group">
                    <div id="config-summary" class="metric-card">
                        <div class="metric-value" id="total-examples">50</div>
                        <div class="metric-label">Total Examples</div>
                        <div style="margin-top: 10px; font-size: 0.9rem; color: var(--text-muted);">
                            Estimated time: <span id="estimated-time">40-60 minutes</span>
                        </div>
                    </div>
                </div>


                <!-- Action Buttons -->
                <div class="form-actions">
                    <button type="button" id="validate-config-btn" class="btn btn-secondary">
                        <i class="fas fa-check-circle"></i> Validate Configuration
                    </button>
                    <button type="submit" id="start-benchmark-btn" class="btn btn-primary">
                        <i class="fas fa-play"></i> Start Benchmark
                    </button>
                </div>
            </form>
        </div>
    </div>

    <!-- Progress Panel - Reusing research progress component -->
    <div id="benchmark-progress" class="benchmark-progress">
        <div class="card benchmark-card">
            <div class="card-content">
                <div class="progress-info">
                    <div class="current-query-container">
                        <div class="current-query-label">Current Benchmark:</div>
                        <div id="current-benchmark" class="current-query"></div>
                    </div>
                    <div class="progress-container">
                        <div class="progress-bar">
                            <div id="progress-bar" class="progress-fill"></div>
                        </div>
                        <div id="progress-percentage" class="progress-percentage">0%</div>
                    </div>
                    <div class="status-container">
                        <div class="status-label">Status:</div>
                        <div id="status-text" class="status-indicator">Initializing</div>
                    </div>
                    <div class="task-container">
                        <div class="task-label">Current Task:</div>
                        <div id="current-task" class="task-text">Starting benchmark...</div>
                    </div>

                    <!-- Benchmark-specific metrics -->
                    <div class="metrics-grid">
                        <div class="metric-card">
                            <div class="metric-value" id="overall-accuracy">--%</div>
                            <div class="metric-label">Overall Accuracy</div>
                            <div class="metric-subtitle" id="accuracy-confidence" style="font-size: 0.8rem; color: var(--text-muted); margin-top: 2px;">--</div>
                        </div>
                        <!-- NOTE(review): id="estimated-time" is also assigned to the span inside #config-summary earlier in this template. Ids must be unique per document, and an id lookup resolves to only one of the two elements, so confirm which one the page script means to update before renaming either. -->
                        <div class="metric-card">
                            <div class="metric-value" id="estimated-time">--</div>
                            <div class="metric-label">Est. Time Left</div>
                            <div class="metric-subtitle" id="elapsed-time" style="font-size: 0.8rem; color: var(--text-muted); margin-top: 2px;">--</div>
                        </div>
                        <div class="metric-card">
                            <div class="metric-value" id="completed-count">0</div>
                            <div class="metric-label">Completed</div>
                        </div>
                        <div class="metric-card">
                            <div class="metric-value" id="processing-rate">--</div>
                            <div class="metric-label">Avg Time/Example</div>
                        </div>
                    </div>

                    <!-- SearXNG Rate Limiting Warning in Progress -->
                    <div id="rate-limit-warning" class="alert alert-warning" style="margin-top: 15px; margin-bottom: 15px; display: none;">
                        <i class="fas fa-exclamation-triangle"></i>
                        <strong>Rate Limiting Detected!</strong> SearXNG is blocking requests due to too many parallel searches.
                        <br><small><strong>Quick fix:</strong> <code>docker restart searxng</code> or wait 5-10 minutes for limits to reset.</small>
                        <br><small><strong>Prevention:</strong> Reduce iterations/questions per iteration in Settings.</small>
                    </div>

                    <div id="dataset-accuracies">
                        <div class="dataset-accuracy">
                            <span>SimpleQA: <strong id="simpleqa-accuracy">--%</strong></span>
                            <span>BrowseComp: <strong id="browsecomp-accuracy">--%</strong></span>
                        </div>
                    </div>

                    <!-- Benchmark Control Actions -->
                    <div class="progress-actions" style="margin: 20px 0; text-align: center;">
                        <button id="cancel-benchmark-btn" class="btn btn-outline terminate-btn">
                            <i class="fas fa-stop-circle"></i> Cancel Benchmark
                        </button>
                        <button id="view-results-btn" class="btn btn-primary" style="display: none;">
                            <i class="fas fa-eye"></i> View Results
                        </button>
                    </div>

                    <!-- Current Question Display -->
                    <div id="current-question-section" class="benchmark-section" style="margin-top: 20px;">
                        <h4 style="color: var(--primary-color); margin-bottom: 15px;">
                            <i class="fas fa-question-circle"></i> Current Question
                        </h4>
                        <div id="current-question-card" class="question-card">
                            <div class="question-content">
                                <div class="question-text" id="current-question-text">No question being processed...</div>
                                <div class="question-meta">
                                    <span class="dataset-badge" id="current-dataset">--</span>
                                    <span class="example-id" id="current-example-id">--</span>
                                </div>
                            </div>
                            <div class="processing-status" id="current-processing-status">
                                <i class="fas fa-clock"></i> Waiting for benchmark to start...
                            </div>
                        </div>
                    </div>

                    <!-- Performance Charts -->
                    <div id="performance-charts-section" class="charts-section" style="display: none;">
                        <h4 style="color: var(--primary-color); margin-bottom: 15px;">
                            <i class="fas fa-chart-line"></i> Performance Analysis
                        </h4>
                        <div class="charts-grid">
                            <div class="chart-container">
                                <div class="chart-title">Accuracy Trend</div>
                                <canvas id="accuracy-chart" class="chart-canvas"></canvas>
                            </div>
                            <div class="chart-container">
                                <div class="chart-title">Processing Time per Example</div>
                                <canvas id="timing-chart" class="chart-canvas"></canvas>
                            </div>
                        </div>
                        <div class="charts-grid" style="margin-top: 20px;">
                            <div class="chart-container">
                                <div class="chart-title">Search Results Count</div>
                                <canvas id="search-results-chart" class="chart-canvas"></canvas>
                            </div>
                            <div class="chart-container">
                                <div class="chart-title">Search Quality Alert</div>
                                <div id="search-quality-status" style="padding: 20px; text-align: center; color: #e0e0e0;">
                                    <div id="search-status-icon" style="font-size: 2rem; margin-bottom: 10px;">
                                        <i class="fas fa-search"></i>
                                    </div>
                                    <div id="search-status-text" style="font-size: 1.1rem; margin-bottom: 10px;">
                                        Waiting for data...
                                    </div>
                                    <div id="search-status-details" style="font-size: 0.9rem; color: #a0a0a0;">
                                        Search result monitoring will begin when benchmark starts
                                    </div>
                                </div>
                            </div>
                        </div>
                    </div>

                    <!-- All Results Display -->
                    <div id="recent-results-section" class="benchmark-section" style="margin-top: 20px;">
                        <h4 style="color: var(--primary-color); margin-bottom: 15px;">
                            <i class="fas fa-history"></i> All Results
                        </h4>
                        <div id="recent-results-container">
                            <div class="no-results">No results yet...</div>
                        </div>
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>

<script>
// Benchmark configuration and progress tracking
// Shared page state for the inline benchmark script. Everything below is
// mutated by the functions in this script and restored by resetForm().
// currentBenchmarkId: id of the run being tracked (taken from `benchmark_run_id`
// in the start/running API responses); null while no run is tracked.
let currentBenchmarkId = null;
// progressInterval: handle returned by setInterval for the status polling loop;
// null when polling is stopped.
let progressInterval = null;

// Global variables for evaluation settings data
// Each holds the parsed JSON payload of the matching
// /settings/api/benchmark.evaluation.* request made by loadCurrentSettings().
let evalProviderData = null;
let evalModelData = null;
let evalTempData = null;
let evalEndpointData = null;


// Charts for performance tracking
// Chart instances are created by initializeCharts() and destroyed (then set
// back to null) by resetForm().
let accuracyChart = null;
let timingChart = null;
let searchResultsChart = null;
// Series backing the charts; resetForm() replaces this object with empty arrays.
// NOTE(review): presumably the arrays are index-aligned, one entry per example; confirm against updateCharts().
let chartData = {
    examples: [],
    accuracies: [],
    processingTimes: [],
    timestamps: [],
    searchResultCounts: []
};

// Search quality monitoring
// Both values are cleared by resetForm(); they are populated outside this part
// of the script.
let recentSearchCounts = [];
let searchQualityAlert = false;

// Page bootstrap. Once the DOM is parsed: bring up the socket service on a
// best-effort basis (a failure is only logged, never retried here), then wire
// the form, load settings, refresh the summary and look for a run to resume.
document.addEventListener('DOMContentLoaded', () => {
    const socketService = window.socket;
    const canInitialize = socketService && typeof socketService.initializeSocket === 'function';

    if (canInitialize) {
        try {
            socketService.initializeSocket();
        } catch (e) {
            // Real-time updates are optional; polling still works without them.
            console.warn('Socket initialization failed, continuing without real-time updates');
        }
    }

    initializeBenchmarkForm();
    initializeEvaluationSettings();
    loadCurrentSettings();
    updateConfigSummary();
    checkForRunningBenchmark();
});

// Attaches every event handler the configuration form needs:
//   - submit starts a benchmark instead of performing a native form post
//   - the validate and cancel buttons call their respective actions
//   - any input/select change refreshes the configuration summary
//   - the dataset checkboxes enable/disable their example-count inputs
function initializeBenchmarkForm() {
    const byId = (id) => document.getElementById(id);

    const benchmarkForm = byId('benchmark-form');
    const validateButton = byId('validate-config-btn');
    const cancelButton = byId('cancel-benchmark-btn');

    benchmarkForm.addEventListener('submit', (event) => {
        event.preventDefault();
        startBenchmark();
    });

    validateButton.addEventListener('click', validateConfiguration);
    cancelButton.addEventListener('click', cancelBenchmark);

    // Keep the summary in sync with every form control.
    for (const field of benchmarkForm.querySelectorAll('input, select')) {
        field.addEventListener('change', updateConfigSummary);
    }

    // SimpleQA: the count input is only editable while the dataset is enabled.
    byId('simpleqa_enabled').addEventListener('change', (event) => {
        byId('simpleqa_count').disabled = !event.currentTarget.checked;
        updateConfigSummary();
    });

    // BrowseComp: same toggle, but the count is also reset (0 when disabled,
    // 5 as a starting value when enabled).
    byId('browsecomp_enabled').addEventListener('change', (event) => {
        const enabled = event.currentTarget.checked;
        const countField = byId('browsecomp_count');
        countField.disabled = !enabled;
        countField.value = enabled ? 5 : 0;
        updateConfigSummary();
    });
}

// Recomputes the configuration summary from the dataset toggles and counts:
// writes the total number of examples to #total-examples and a rough duration
// estimate (1.5 minutes per example) to #estimated-time.
// Disabled datasets and empty/non-numeric counts contribute 0.
// NOTE(review): the progress panel's "Est. Time Left" metric also uses the id
// "estimated-time"; confirm the summary element has a distinct id, otherwise
// both functions write to whichever element comes first in the document.
function updateConfigSummary() {
    // Example count for one dataset, or 0 when its checkbox is off.
    const enabledCount = (toggleId, countId) => {
        if (!document.getElementById(toggleId).checked) return 0;
        return parseInt(document.getElementById(countId).value, 10) || 0;
    };

    const totalExamples =
        enabledCount('simpleqa_enabled', 'simpleqa_count') +
        enabledCount('browsecomp_enabled', 'browsecomp_count');
    document.getElementById('total-examples').textContent = totalExamples;

    // Estimate time (roughly 1-2 minutes per example)
    const estimatedMinutes = Math.round(totalExamples * 1.5);
    let estimatedTime;
    if (estimatedMinutes < 60) {
        estimatedTime = `${estimatedMinutes} minutes`;
    } else {
        // Pluralize on the rounded hour value actually displayed; keying off
        // the minute count produced "2 hour" for 90-119 minutes.
        const estimatedHours = Math.round(estimatedMinutes / 60);
        estimatedTime = `${estimatedHours} hour${estimatedHours === 1 ? '' : 's'}`;
    }
    document.getElementById('estimated-time').textContent = estimatedTime;
}

// Sends the current form configuration to the validation endpoint and reports
// the outcome through showAlert(): a success alert when the server answers
// `valid`, otherwise the list of errors it returned. Network and parsing
// failures are logged and surfaced as an error alert.
function validateConfiguration() {
    const config = getConfigurationData();

    // Forward the CSRF token the page exposes in its <meta name="csrf-token">
    // tag, so the POST is accepted when CSRF protection is enforced.
    const headers = { 'Content-Type': 'application/json' };
    const csrfMeta = document.querySelector('meta[name="csrf-token"]');
    const csrfToken = csrfMeta ? csrfMeta.getAttribute('content') : null;
    if (csrfToken) {
        headers['X-CSRFToken'] = csrfToken;
    }

    fetch('/benchmark/api/validate-config', {
        method: 'POST',
        headers: headers,
        body: JSON.stringify(config)
    })
    .then(response => response.json())
    .then(data => {
        if (data.valid) {
            showAlert('Configuration is valid! Ready to start benchmark.', 'success');
        } else {
            // A failure response may lack an `errors` array (e.g. a generic
            // error payload); fall back instead of throwing on `.join`.
            const errors = Array.isArray(data.errors)
                ? data.errors
                : [data.error || 'Unknown validation error'];
            showAlert('Configuration errors: ' + errors.join(', '), 'error');
        }
    })
    .catch(error => {
        console.error('Validation error:', error);
        showAlert('Error validating configuration: ' + error.message, 'error');
    });
}

// Builds the request payload from the form: the run name plus one
// `{ count }` entry per enabled dataset. A dataset whose checkbox is off is
// left out entirely; an empty or non-numeric count becomes 0.
// Returns: { run_name: string, datasets_config: { simpleqa?, browsecomp? } }
function getConfigurationData() {
    const datasets_config = {};

    for (const dataset of ['simpleqa', 'browsecomp']) {
        const enabled = document.getElementById(`${dataset}_enabled`).checked;
        if (!enabled) continue;

        const rawCount = document.getElementById(`${dataset}_count`).value;
        datasets_config[dataset] = { count: parseInt(rawCount) || 0 };
    }

    // All other config will be taken from database by the backend
    return {
        run_name: document.getElementById('run_name').value,
        datasets_config: datasets_config
    };
}

// Starts a benchmark run with the current form configuration.
// The form is swapped for the progress panel immediately; on a successful
// response the returned run id is stored and progress tracking begins, on any
// failure the error is shown and the form is restored via resetForm().
function startBenchmark() {
    const config = getConfigurationData();

    // Disable form
    document.getElementById('benchmark-form').style.display = 'none';
    document.getElementById('benchmark-progress').style.display = 'block';

    // Forward the CSRF token the page exposes in its <meta name="csrf-token">
    // tag, so the POST is accepted when CSRF protection is enforced.
    const headers = { 'Content-Type': 'application/json' };
    const csrfMeta = document.querySelector('meta[name="csrf-token"]');
    const csrfToken = csrfMeta ? csrfMeta.getAttribute('content') : null;
    if (csrfToken) {
        headers['X-CSRFToken'] = csrfToken;
    }

    fetch('/benchmark/api/start', {
        method: 'POST',
        headers: headers,
        body: JSON.stringify(config)
    })
    .then(response => response.json())
    .then(data => {
        if (data.success) {
            currentBenchmarkId = data.benchmark_run_id;
            showAlert('Benchmark started successfully!', 'success');
            startProgressTracking();
        } else {
            // Avoid showing the literal text "undefined" when the server
            // gives no reason.
            showAlert('Error starting benchmark: ' + (data.error || 'Unknown error'), 'error');
            resetForm();
        }
    })
    .catch(error => {
        console.error('Start error:', error);
        showAlert('Error starting benchmark: ' + error.message, 'error');
        resetForm();
    });
}

// Begins live tracking of the run identified by currentBenchmarkId:
// creates and reveals the charts, schedules a one-off load of historical chart
// data, starts the 3-second status polling loop and, when the socket service
// is present, subscribes to detailed progress events.
// Does nothing when no run id is set. Safe to call again while already
// tracking: the previous polling loop is replaced, not duplicated.
function startProgressTracking() {
    if (!currentBenchmarkId) return;

    // Remember which run this call is for; the delayed subscription below
    // must not attach to a run that was reset in the meantime.
    const trackedBenchmarkId = currentBenchmarkId;

    // Initialize charts only when none exist yet. initializeCharts() builds
    // new Chart instances on fixed canvases, and recent Chart.js versions
    // refuse to reuse a canvas that still has a live chart.
    if (!accuracyChart) {
        initializeCharts();
    }

    // Show charts section
    document.getElementById('performance-charts-section').style.display = 'block';

    // Load historical data if reconnecting to running benchmark
    setTimeout(() => {
        loadHistoricalChartData();
    }, 1000);

    // Never leave an older polling loop running: its handle would be
    // overwritten and could no longer be cleared.
    if (progressInterval) {
        clearInterval(progressInterval);
    }
    progressInterval = setInterval(() => {
        updateBenchmarkProgress();
    }, 3000); // Update every 3 seconds (reduced from 2 for better performance)

    // Initialize socket if not already done
    if (!window.socket || !window.socket.initializeSocket) {
        console.log('Socket service not available');
    } else if (!window.socket.socket) {
        console.log('Socket not initialized, initializing now...');
        try {
            window.socket.initializeSocket();
        } catch (e) {
            // Same best-effort policy as at page load. Letting this escape
            // would land in startBenchmark's error handler and reset the form
            // even though the run has already started.
            console.warn('Socket initialization failed, continuing with polling only');
        }
    }

    // Connect to WebSocket for detailed progress updates (reuse socket service)
    setTimeout(() => {
        if (currentBenchmarkId !== trackedBenchmarkId) {
            return; // run was cancelled/reset before the socket was ready
        }
        if (window.socket && typeof window.socket.subscribeToResearch === 'function') {
            console.log('Subscribing to benchmark progress for ID:', trackedBenchmarkId);
            // Subscribe to benchmark progress events using research subscription (same format)
            window.socket.subscribeToResearch(trackedBenchmarkId, (eventData) => {
                handleDetailedProgress(eventData);
            });
        } else {
            console.warn('Socket service not available, falling back to polling only');
        }
    }, 500); // Small delay to ensure socket is ready
}

// Socket callback for detailed progress events: mirrors the event's example id
// and status text into the #current-task label. Events without a status, or a
// page without that label, are ignored.
function handleDetailedProgress(data) {
    const taskLabel = document.getElementById('current-task');
    if (!taskLabel || !data.status) {
        return;
    }

    taskLabel.textContent = `Example ${data.example_id}: ${data.status}`;
}

// Track last update times to avoid too frequent updates
// (epoch milliseconds of the last results-list / charts refresh)
let lastResultsUpdate = 0;
let lastChartsUpdate = 0;

// One polling tick: fetches the status of the tracked run and refreshes the
// progress metrics, current-question card, search-quality and rate-limit
// indicators. The results list and charts are throttled (10 s / 5 s) but are
// always refreshed on the final tick so the finished view is complete.
// Polling stops once the run reports completed, failed or cancelled.
function updateBenchmarkProgress() {
    if (!currentBenchmarkId) return;

    // A response may arrive after the run was cancelled or reset; remember
    // which run was asked about so a late answer can be discarded.
    const requestedBenchmarkId = currentBenchmarkId;

    fetch(`/benchmark/api/status/${requestedBenchmarkId}`)
    .then(response => response.json())
    .then(data => {
        if (!data.success || requestedBenchmarkId !== currentBenchmarkId) {
            return;
        }

        const status = data.status;
        const isFinished = status.status === 'completed' ||
            status.status === 'failed' ||
            status.status === 'cancelled';

        updateProgressDisplay(status);

        // Update question/answer displays
        updateCurrentQuestion(status);

        // Only update results every 10 seconds to avoid performance issues.
        // The final tick bypasses the throttle: no later tick would pick up
        // the last results once polling has stopped.
        const now = Date.now();
        if (isFinished || now - lastResultsUpdate > 10000) {
            updateRecentResults();
            lastResultsUpdate = now;
        }

        // Update charts every 5 seconds (and on the final tick)
        if (isFinished || now - lastChartsUpdate > 5000) {
            updateCharts(status);
            lastChartsUpdate = now;
        }

        // Update search result monitoring
        updateSearchQualityMonitoring();

        // Update rate limiting status
        updateRateLimitingStatus();

        if (isFinished) {
            clearInterval(progressInterval);
            progressInterval = null;

            if (status.status === 'completed') {
                showAlert('Benchmark completed successfully!', 'success');
            } else {
                showAlert(`Benchmark ${status.status}: ${status.error_message || ''}`, 'error');
            }
        }
    })
    .catch(error => {
        console.error('Progress update error:', error);
    });
}

// Renders one status payload into the progress panel: progress bar and
// percentage, status/task labels, run name, overall accuracy with its
// confidence interval, time estimates, average time per example, completed
// count and per-dataset accuracies.
// Every element is optional: ids missing from the page are skipped.
// Missing or zero-valued metrics are shown as a "--" placeholder, because the
// checks are truthiness-based.
function updateProgressDisplay(status) {
    // Sets an element's text only if the element exists. The text is produced
    // lazily so nothing is computed for elements that are absent.
    const fill = (id, buildText) => {
        const node = document.getElementById(id);
        if (node) node.textContent = buildText();
    };
    const asPercent = (value) => (value ? value.toFixed(1) + '%' : '--%');
    const wholeMinutes = (seconds) => Math.round(seconds / 60);

    let percentage = 0;
    if (status.total_examples > 0) {
        percentage = status.completed_examples / status.total_examples * 100;
    }

    // Progress bar and labels (using research progress component IDs)
    const bar = document.getElementById('progress-bar');
    if (bar) bar.style.width = percentage + '%';
    fill('progress-percentage', () => Math.round(percentage) + '%');
    fill('status-text', () => status.status || 'Running');
    fill('current-task', () =>
        `Processing example ${status.completed_examples} of ${status.total_examples}`);
    if (status.run_name) {
        fill('current-benchmark', () => status.run_name);
    }

    // Overall accuracy with confidence interval
    fill('overall-accuracy', () => asPercent(status.overall_accuracy));
    fill('accuracy-confidence', () => {
        const interval = status.accuracy_confidence;
        if (!interval) return '--';
        return `±${interval.margin_of_error.toFixed(1)}% (95% CI, n=${interval.sample_size})`;
    });

    // Time estimates (values are divided by 60 and shown as minutes)
    fill('estimated-time', () => {
        if (!status.estimated_time_remaining) return '--';
        const minutes = wholeMinutes(status.estimated_time_remaining);
        return minutes > 0 ? `${minutes}m` : '<1m';
    });
    fill('elapsed-time', () => {
        if (!status.total_elapsed_time) return '--';
        return `${wholeMinutes(status.total_elapsed_time)}m elapsed`;
    });

    // Average processing time per example
    fill('processing-rate', () => {
        if (!status.avg_time_per_example) return '--';
        return `${(status.avg_time_per_example / 60).toFixed(1)}m`;
    });

    fill('completed-count', () => status.completed_examples);

    // Per-dataset accuracy
    fill('simpleqa-accuracy', () => asPercent(status.simpleqa_accuracy));
    fill('browsecomp-accuracy', () => asPercent(status.browsecomp_accuracy));
}

// Asks the server to cancel the tracked run. On success polling is stopped
// and the page returns to the configuration form; on failure the reason is
// shown and tracking continues. Does nothing when no run is tracked.
function cancelBenchmark() {
    if (!currentBenchmarkId) return;

    // Forward the CSRF token the page exposes in its <meta name="csrf-token">
    // tag, so the POST is accepted when CSRF protection is enforced.
    const headers = {};
    const csrfMeta = document.querySelector('meta[name="csrf-token"]');
    const csrfToken = csrfMeta ? csrfMeta.getAttribute('content') : null;
    if (csrfToken) {
        headers['X-CSRFToken'] = csrfToken;
    }

    fetch(`/benchmark/api/cancel/${currentBenchmarkId}`, {
        method: 'POST',
        headers: headers
    })
    .then(response => response.json())
    .then(data => {
        if (data.success) {
            showAlert('Benchmark cancelled successfully.', 'info');
            clearInterval(progressInterval);
            progressInterval = null;
            resetForm();
        } else {
            // Avoid showing the literal text "undefined" when the server
            // gives no reason.
            showAlert('Error cancelling benchmark: ' + (data.error || 'Unknown error'), 'error');
        }
    })
    .catch(error => {
        console.error('Cancel error:', error);
        showAlert('Error cancelling benchmark: ' + error.message, 'error');
    });
}

// Returns the page to its idle state: shows the configuration form, hides the
// progress panel and charts, stops polling, clears chart/search-monitoring
// state, destroys the chart instances and unsubscribes from the socket events
// of the run that was being tracked.
function resetForm() {
    // Capture the id before it is cleared: the socket unsubscribe at the end
    // needs the id of the run being abandoned, not null.
    const previousBenchmarkId = currentBenchmarkId;

    document.getElementById('benchmark-form').style.display = 'block';
    document.getElementById('benchmark-progress').style.display = 'none';
    document.getElementById('performance-charts-section').style.display = 'none';
    currentBenchmarkId = null;

    // Clear any running intervals
    if (progressInterval) {
        clearInterval(progressInterval);
        progressInterval = null;
    }

    // Reset loading flags
    window.modelsLoading = false;

    // Reset chart data
    chartData = {
        examples: [],
        accuracies: [],
        processingTimes: [],
        timestamps: [],
        searchResultCounts: []
    };

    // Reset search quality monitoring
    recentSearchCounts = [];
    searchQualityAlert = false;

    // Destroy existing charts so their canvases can be reused by the next run
    if (accuracyChart) {
        accuracyChart.destroy();
        accuracyChart = null;
    }
    if (timingChart) {
        timingChart.destroy();
        timingChart = null;
    }
    if (searchResultsChart) {
        searchResultsChart.destroy();
        searchResultsChart = null;
    }

    // Unsubscribe from socket events if connected
    if (window.socket && window.socket.unsubscribeFromResearch) {
        window.socket.unsubscribeFromResearch(previousBenchmarkId);
    }
}

// Shows a dismissible alert inside #benchmark-alert, replacing any alert that
// is already there.
//   message: text to display. It is rendered as plain text, never parsed as
//            HTML, because callers pass server- and exception-derived strings.
//   type:    suffix for the `alert-<type>` CSS class ('success', 'error',
//            'info', 'warning'). 'success' alerts hide themselves after 5 s.
function showAlert(message, type) {
    const alertContainer = document.getElementById('benchmark-alert');
    if (!alertContainer) {
        console.warn('Alert container not found; message was:', message);
        return;
    }

    // Cancel a pending auto-hide from an earlier success alert; otherwise it
    // would hide this newer alert when it fires.
    if (showAlert.hideTimer) {
        clearTimeout(showAlert.hideTimer);
        showAlert.hideTimer = null;
    }

    const alertBox = document.createElement('div');
    alertBox.className = `settings-alert alert-${type}`;

    const messageText = document.createElement('span');
    messageText.textContent = message;

    const closeButton = document.createElement('button');
    closeButton.type = 'button';
    closeButton.className = 'close-alert';
    closeButton.setAttribute('aria-label', 'Close');
    closeButton.addEventListener('click', () => {
        alertContainer.style.display = 'none';
    });

    const closeIcon = document.createElement('i');
    closeIcon.className = 'fas fa-times';
    closeButton.appendChild(closeIcon);

    alertBox.appendChild(messageText);
    alertBox.appendChild(closeButton);

    // Drop the previous alert, then show the new one.
    alertContainer.textContent = '';
    alertContainer.appendChild(alertBox);
    alertContainer.style.display = 'block';

    // Auto-hide success messages
    if (type === 'success') {
        showAlert.hideTimer = setTimeout(() => {
            alertContainer.style.display = 'none';
            showAlert.hideTimer = null;
        }, 5000);
    }
}

// On page load, asks the server whether a benchmark is already running. If one
// is, adopts its id, switches the page to the progress panel and resumes
// tracking. Any failure is treated as "nothing running" and only logged.
function checkForRunningBenchmark() {
    fetch('/benchmark/api/running')
        .then((response) => response.json())
        .then((payload) => {
            const runningId = payload.success && payload.benchmark_run_id;
            if (!runningId) {
                return;
            }

            currentBenchmarkId = runningId;
            showAlert(`Reconnected to running benchmark #${currentBenchmarkId}`, 'info');

            // Swap the configuration form for the progress panel.
            const formEl = document.getElementById('benchmark-form');
            const progressEl = document.getElementById('benchmark-progress');
            formEl.style.display = 'none';
            progressEl.style.display = 'block';

            startProgressTracking();
        })
        .catch(() => {
            console.log('No running benchmark found (this is normal)');
        });
}

// Loads the active LLM, search and evaluation settings from the settings API
// (one request per key, issued in parallel) and shows them in the "current
// settings" panel. The evaluation payloads are kept in the eval*Data globals
// and only logged here. On any failure every metric in the panel is set to
// "Error loading" and a warning alert is shown.
async function loadCurrentSettings() {
    console.log('Starting loadCurrentSettings...');

    // `settings.value` of a settings payload, or undefined if any level is missing.
    const valueOf = (payload) => payload?.settings?.value;
    const showText = (elementId, text) => {
        document.getElementById(elementId).textContent = text;
    };

    try {
        const settingKeys = [
            'llm.provider',
            'llm.model',
            'search.tool',
            'search.iterations',
            'search.questions_per_iteration',
            'search.search_strategy',
            'benchmark.evaluation.provider',
            'benchmark.evaluation.model',
            'benchmark.evaluation.temperature',
            'benchmark.evaluation.endpoint_url'
        ];

        const [
            llmProviderResp,
            llmModelResp,
            searchToolResp,
            iterationsResp,
            questionsResp,
            strategyResp,
            evalProviderResp,
            evalModelResp,
            evalTempResp,
            evalEndpointResp
        ] = await Promise.all(settingKeys.map((key) => fetch(`/settings/api/${key}`)));

        // Bodies are parsed one after another, in request order.
        const llmProviderData = await llmProviderResp.json();
        const llmModelData = await llmModelResp.json();
        const searchToolData = await searchToolResp.json();
        const iterationsData = await iterationsResp.json();
        const questionsData = await questionsResp.json();
        const strategyData = await strategyResp.json();
        evalProviderData = await evalProviderResp.json();
        evalModelData = await evalModelResp.json();
        evalTempData = await evalTempResp.json();
        evalEndpointData = await evalEndpointResp.json();

        // LLM provider/model: a problem here must not block the remaining fields.
        try {
            const providerEl = document.getElementById('current-provider');
            const modelEl = document.getElementById('current-model');

            const provider = valueOf(llmProviderData);
            if (providerEl) {
                providerEl.textContent = provider ? provider.toUpperCase() : 'Not set';
            }

            const model = valueOf(llmModelData);
            if (modelEl) {
                modelEl.textContent = model || 'Not set';
            }
        } catch (e) {
            console.error('Error setting LLM display:', e);
        }

        // Search tool; a configured tool is also checked for known warnings.
        const searchTool = valueOf(searchToolData);
        if (searchTool) {
            showText('current-search-tool', searchTool);
            checkSearchEngineWarnings(searchTool);
        } else {
            showText('current-search-tool', 'Not set');
        }

        // Numeric and strategy settings fall back to their defaults when unset.
        showText('current-iterations', valueOf(iterationsData) || '8');
        showText('current-questions', valueOf(questionsData) || '5');
        showText('current-strategy', valueOf(strategyData) || 'focused_iteration');

        // Evaluation settings have no display elements yet, so they are only logged.
        // TODO: Add evaluation settings display section if needed
        console.log('Evaluation settings loaded:', {
            provider: evalProviderData?.settings?.value || 'openai_endpoint',
            model: evalModelData?.settings?.value || 'anthropic/claude-3.7-sonnet',
            temperature: evalTempData?.settings?.value || 0,
            endpoint: evalEndpointData?.settings?.value || 'https://openrouter.ai/api/v1'
        });

    } catch (error) {
        console.error('Error loading current settings:', error);
        console.error('Error details:', error.message);
        console.error('Error stack:', error.stack);

        // Mark every metric in the panel as failed.
        const metricEls = document.querySelectorAll('#current-settings-display .metric-value');
        metricEls.forEach((el) => {
            el.textContent = 'Error loading';
        });

        showAlert('Could not load current settings. Check console for details.', 'warning');
    }
}

// Updates the "Current Question" card from a status payload. Three views:
//   in_progress -> spinner plus "Processing example <completed + 1> of <total>"
//   completed   -> check mark plus the completed/total tally
//   anything else -> the idle "Waiting..." placeholders
function updateCurrentQuestion(status) {
    const questionEl = document.getElementById('current-question-text');
    const datasetEl = document.getElementById('current-dataset');
    const exampleEl = document.getElementById('current-example-id');
    const statusEl = document.getElementById('current-processing-status');

    let view;
    switch (status.status) {
        case 'in_progress': {
            // The example being worked on is the one after the last completed.
            const position = status.completed_examples + 1;
            view = {
                html: '<i class="fas fa-spinner fa-spin"></i> Processing...',
                cssClass: 'processing-status processing',
                question: `Processing example ${position} of ${status.total_examples}`,
                dataset: 'Active',
                example: `Example ${position}`
            };
            break;
        }
        case 'completed':
            view = {
                html: '<i class="fas fa-check-circle"></i> Benchmark completed!',
                cssClass: 'processing-status completed',
                question: 'All questions processed successfully.',
                dataset: 'Completed',
                example: `${status.completed_examples}/${status.total_examples}`
            };
            break;
        default:
            view = {
                html: '<i class="fas fa-clock"></i> Waiting...',
                cssClass: 'processing-status',
                question: 'No question being processed...',
                dataset: '--',
                example: '--'
            };
    }

    statusEl.innerHTML = view.html;
    statusEl.className = view.cssClass;
    questionEl.textContent = view.question;
    datasetEl.textContent = view.dataset;
    exampleEl.textContent = view.example;
}

// Most recently rendered results list, kept so an unchanged server response
// does not trigger a re-render.
let lastResultsData = null;

// Fetches up to 50 results of the tracked run and re-renders the results list
// only when the payload differs from the one rendered last (compared by JSON
// serialization). Does nothing when no run is tracked; fetch errors are logged.
function updateRecentResults() {
    if (!currentBenchmarkId) return;

    // Limit reduced from 100 to 50 for performance
    const resultsUrl = `/benchmark/api/results/${currentBenchmarkId}?limit=50`;

    fetch(resultsUrl)
        .then((response) => response.json())
        .then((payload) => {
            if (!(payload.success && payload.results)) {
                return;
            }

            const unchanged =
                JSON.stringify(payload.results) === JSON.stringify(lastResultsData);
            if (unchanged) {
                return;
            }

            lastResultsData = payload.results;
            displayRecentResults(payload.results);
        })
        .catch((error) => {
            console.error('Error fetching all results:', error);
        });
}

// Maps a search-result count to the severity class of its badge:
// 1 or fewer -> 'critical', 2 to 4 -> 'warning', anything else -> 'good'.
function getSearchCountClass(count) {
    return count <= 1 ? 'critical'
        : count <= 4 ? 'warning'
        : 'good';
}

// Renders the results list into #recent-results-container.
//   results: array of result objects; the fields read here are is_correct,
//            dataset_type, example_id, search_result_count (optional),
//            question, model_answer and correct_answer.
// An empty or missing list shows the "No results yet..." placeholder.
// Answers longer than 200 characters are rendered truncated with a
// "Show more"/"Show less" toggle; which answers were expanded is remembered
// across re-renders. All result text is HTML-escaped before insertion, since
// questions and model answers are external content.
function displayRecentResults(results) {
    const container = document.getElementById('recent-results-container');

    if (!results || results.length === 0) {
        container.innerHTML = '<div class="no-results">No results yet...</div>';
        return;
    }

    // Escapes the five HTML-significant characters so text cannot introduce
    // markup or break out of an attribute.
    const HTML_ENTITIES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ENTITIES[ch]);

    // Save expanded states before re-rendering
    const expandedStates = {};
    container.querySelectorAll('[id^="toggle-"]').forEach(toggle => {
        const id = toggle.id.replace('toggle-', '');
        const fullTextElement = document.getElementById(`full-${id}`);
        if (fullTextElement && fullTextElement.style.display !== 'none') {
            expandedStates[id] = true;
        }
    });

    // Builds the (escaped) markup for one answer; long answers get a
    // truncated view, a full view and a toggle link handled by toggleText().
    // `id` is generated locally (e.g. "model-3"), never taken from the data.
    const createExpandableText = (text, id, maxLength = 200) => {
        if (!text) return 'No answer provided';

        const raw = String(text); // tolerate non-string answers
        if (raw.length <= maxLength) return escapeHtml(raw);

        // Truncate the raw text first so an entity is never cut in half.
        const truncated = escapeHtml(raw.substring(0, maxLength)) + '...';
        const isExpanded = expandedStates[id] || false;

        return `
                <span id="truncated-${id}" style="display: ${isExpanded ? 'none' : 'inline'};">${truncated}</span>
                <span id="full-${id}" style="display: ${isExpanded ? 'inline' : 'none'};">${escapeHtml(raw)}</span>
                <a href="#" onclick="toggleText('${id}'); return false;" id="toggle-${id}" style="color: #2196f3; font-size: 0.85rem; margin-left: 5px; text-decoration: underline;">${isExpanded ? 'Show less' : 'Show more'}</a>
            `;
    };

    const resultsHtml = results.map((result, index) => {
        const statusClass = result.is_correct ? 'correct' : 'incorrect';
        const statusIcon = result.is_correct ? '<i class="fas fa-check-circle"></i>' : '<i class="fas fa-times-circle"></i>';
        const statusText = result.is_correct ? 'Correct' : 'Incorrect';

        // Show the badge only for a real count; null as well as undefined
        // means "no count available".
        const searchBadge = result.search_result_count != null
            ? `<span class="search-count-badge ${getSearchCountClass(result.search_result_count)}" title="Search results found">${escapeHtml(result.search_result_count)} results</span>`
            : '';

        return `
            <div class="result-card ${statusClass}">
                <div class="result-header">
                    <div>
                        <span class="dataset-badge">${escapeHtml(result.dataset_type)}</span>
                        <span class="example-id">${escapeHtml(result.example_id)}</span>
                        ${searchBadge}
                    </div>
                    <span class="result-status ${statusClass}">
                        ${statusIcon} ${statusText}
                    </span>
                </div>
                <div class="question-text" style="margin-bottom: 10px; font-size: 0.9rem;">
                    ${escapeHtml(result.question || 'No question provided')}
                </div>
                <div class="answer-comparison">
                    <div class="answer-box model-answer">
                        <div class="answer-label">Model Answer</div>
                        <div>${createExpandableText(result.model_answer || 'No answer provided', `model-${index}`)}</div>
                    </div>
                    <div class="answer-box correct-answer">
                        <div class="answer-label">Correct Answer</div>
                        <div>${createExpandableText(result.correct_answer || 'No correct answer available', `correct-${index}`)}</div>
                    </div>
                </div>
            </div>
        `;
    }).join('');

    container.innerHTML = resultsHtml;
}

// Flip an expandable answer between its truncated and its full rendering.
// `id` is the suffix shared by the `truncated-<id>`, `full-<id>` and
// `toggle-<id>` elements; all three are dereferenced without null checks,
// so they must exist in the DOM when this runs.
function toggleText(id) {
    const shortView = document.getElementById(`truncated-${id}`);
    const fullView = document.getElementById(`full-${id}`);
    const toggleLink = document.getElementById(`toggle-${id}`);

    // A hidden truncated span means the full text is currently on screen,
    // so this click collapses it again.
    const collapsing = shortView.style.display === 'none';

    shortView.style.display = collapsing ? 'inline' : 'none';
    fullView.style.display = collapsing ? 'none' : 'inline';
    toggleLink.textContent = collapsing ? 'Show more' : 'Show less';
}

// Chart initialization and management
// Create the three live line charts (accuracy, timing, search results) on
// their canvases and store the Chart instances in the shared
// `accuracyChart`, `timingChart` and `searchResultsChart` variables.
// Every chart starts empty; data points are appended by the update functions.
function initializeCharts() {
    // Dark-theme defaults common to all charts.
    const sharedOptions = {
        responsive: true,
        maintainAspectRatio: false,
        plugins: {
            legend: {
                labels: { color: '#e0e0e0' }
            }
        },
        scales: {
            x: {
                ticks: { color: '#a0a0a0' },
                grid: { color: '#333' }
            },
            y: {
                ticks: { color: '#a0a0a0' },
                grid: { color: '#333' }
            }
        }
    };

    // Shared options with a customised y axis: extra bounds (min/max) plus a
    // tick formatter. Only the y axis is copied; everything else is reused.
    const optionsWithYAxis = (yBounds, formatTick) => ({
        ...sharedOptions,
        scales: {
            ...sharedOptions.scales,
            y: {
                ...sharedOptions.scales.y,
                ...yBounds,
                ticks: {
                    ...sharedOptions.scales.y.ticks,
                    callback: formatTick
                }
            }
        }
    });

    // One smoothed line series that starts with no data.
    const series = (label, borderColor, backgroundColor, fill) => ({
        label,
        data: [],
        borderColor,
        backgroundColor,
        tension: 0.4,
        fill
    });

    // Instantiate a line chart on the canvas with the given element id.
    const createLineChart = (canvasId, datasets, options) => {
        const ctx = document.getElementById(canvasId).getContext('2d');
        return new Chart(ctx, {
            type: 'line',
            data: { labels: [], datasets },
            options
        });
    };

    // Accuracy chart: percentages, so the y axis is pinned to 0-100.
    accuracyChart = createLineChart(
        'accuracy-chart',
        [
            series('Overall Accuracy', '#4caf50', 'rgba(76, 175, 80, 0.1)', true),
            series('SimpleQA Accuracy', '#2196f3', 'rgba(33, 150, 243, 0.1)', false),
            series('BrowseComp Accuracy', '#ff9800', 'rgba(255, 152, 0, 0.1)', false)
        ],
        optionsWithYAxis({ min: 0, max: 100 }, value => value + '%')
    );

    // Timing chart: seconds per example, open-ended upper bound.
    timingChart = createLineChart(
        'timing-chart',
        [
            series('Processing Time (seconds)', '#e91e63', 'rgba(233, 30, 99, 0.1)', true)
        ],
        optionsWithYAxis({ min: 0 }, value => value + 's')
    );

    // Search results chart: tick values are rounded to whole result counts.
    searchResultsChart = createLineChart(
        'search-results-chart',
        [
            series('Search Results Count', '#9c27b0', 'rgba(156, 39, 176, 0.1)', true)
        ],
        optionsWithYAxis({ min: 0 }, value => Math.round(value) + ' results')
    );
}

// Plot the latest benchmark status on the accuracy and timing charts.
//
// `status` is the status object reported for the running benchmark. Fields
// read here: `completed_examples` (x-axis label), `overall_accuracy`,
// `simpleqa_accuracy`, `browsecomp_accuracy` and `avg_time_per_example`.
// Missing/falsy metric values are plotted as 0.
//
// A point is keyed by the number of completed examples: a repeated status
// for the same count overwrites the existing point instead of adding a
// duplicate. Each chart keeps at most 50 points (oldest dropped first).
function updateCharts(status) {
    // Nothing is drawn until all three charts have been created.
    if (!accuracyChart || !timingChart || !searchResultsChart || !status) return;

    const currentExample = status.completed_examples;
    // `!(x > 0)` rather than `x <= 0`: it also rejects undefined and NaN,
    // which would otherwise be pushed onto the charts as bogus labels.
    if (!(currentExample > 0)) return;

    const MAX_POINTS = 50;

    // Insert or overwrite the point labelled `currentExample` on `chart`
    // (one value per dataset, in dataset order), trim, then redraw.
    const upsertPoint = (chart, values) => {
        const labels = chart.data.labels;
        const datasets = chart.data.datasets;
        const existing = labels.indexOf(currentExample);

        if (existing === -1) {
            labels.push(currentExample);
            values.forEach((value, i) => datasets[i].data.push(value));
        } else {
            values.forEach((value, i) => {
                datasets[i].data[existing] = value;
            });
        }

        if (labels.length > MAX_POINTS) {
            labels.shift();
            datasets.forEach(dataset => dataset.data.shift());
        }

        // 'none' skips the transition animation on these frequent updates.
        chart.update('none');
    };

    if (status.overall_accuracy !== undefined) {
        upsertPoint(accuracyChart, [
            status.overall_accuracy || 0,
            status.simpleqa_accuracy || 0,
            status.browsecomp_accuracy || 0
        ]);
    }

    if (status.avg_time_per_example !== undefined) {
        upsertPoint(timingChart, [status.avg_time_per_example || 0]);
    }
}

// Seed the accuracy and timing charts when reconnecting to a benchmark that
// is already running.
//
// Only the current aggregate status is fetched, so there is no real history:
// the current values are repeated at evenly spaced example indices to give
// the charts a starting line. Storing true per-example history would need
// backend support. Failures are logged and otherwise ignored.
async function loadHistoricalChartData() {
    if (!currentBenchmarkId || !accuracyChart || !timingChart) return;

    try {
        const response = await fetch(`/benchmark/api/status/${currentBenchmarkId}`);
        const payload = await response.json();

        // Nothing to draw until at least one example has completed.
        if (!payload.success || !(payload.status.completed_examples > 0)) return;

        const status = payload.status;
        const completed = status.completed_examples;

        // Current aggregates, used for every synthetic point.
        const overall = status.overall_accuracy || 0;
        const simpleqa = status.simpleqa_accuracy || 0;
        const browsecomp = status.browsecomp_accuracy || 0;
        const avgTime = status.avg_time_per_example || 0;

        // Step so that roughly 20 points cover the completed range.
        const stride = Math.max(1, Math.floor(completed / 20));

        for (let exampleIndex = 1; exampleIndex <= completed; exampleIndex += stride) {
            accuracyChart.data.labels.push(exampleIndex);
            accuracyChart.data.datasets[0].data.push(overall);
            accuracyChart.data.datasets[1].data.push(simpleqa);
            accuracyChart.data.datasets[2].data.push(browsecomp);

            timingChart.data.labels.push(exampleIndex);
            timingChart.data.datasets[0].data.push(avgTime);
        }

        accuracyChart.update();
        timingChart.update();
    } catch (error) {
        console.error('Error loading historical chart data:', error);
    }
}

// Search quality monitoring functions
// Fetch the five most recent results of the current benchmark, average their
// `search_result_count` values and forward that average to the search
// results chart and the search quality alert.
// Results without a count (undefined or null) are left out of the average;
// if none has a count, nothing is updated. Errors are logged and ignored.
async function updateSearchQualityMonitoring() {
    if (!currentBenchmarkId) return;

    try {
        const response = await fetch(`/benchmark/api/results/${currentBenchmarkId}?limit=5`);
        const payload = await response.json();

        const hasResults = payload.success && payload.results && payload.results.length > 0;
        if (!hasResults) return;

        // Counts are reported by the backend; keep only the ones present.
        const counts = payload.results
            .map(result => result.search_result_count)
            .filter(count => count !== undefined && count !== null);
        if (counts.length === 0) return;

        const averageCount = counts.reduce((sum, count) => sum + count, 0) / counts.length;
        updateSearchResultsChart(averageCount);
        updateSearchQualityAlert(averageCount);
    } catch (error) {
        console.error('Error updating search quality monitoring:', error);
    }
}

// Append one data point (labelled with the current local time) to the
// search results chart and record the value in `recentSearchCounts`.
// The chart keeps the latest 20 points, `recentSearchCounts` the latest 10.
// Does nothing when the chart is not initialised or no benchmark is active.
function updateSearchResultsChart(avgSearchResults) {
    if (!searchResultsChart || !currentBenchmarkId) return;

    const MAX_CHART_POINTS = 20;
    const MAX_RECENT_SAMPLES = 10;

    const { labels, datasets } = searchResultsChart.data;
    const points = datasets[0].data;

    // A new point is always added; timestamps are not de-duplicated.
    labels.push(new Date().toLocaleTimeString());
    points.push(avgSearchResults);
    if (labels.length > MAX_CHART_POINTS) {
        labels.shift();
        points.shift();
    }

    // Rolling window kept for alert monitoring.
    recentSearchCounts.push(avgSearchResults);
    if (recentSearchCounts.length > MAX_RECENT_SAMPLES) {
        recentSearchCounts.shift();
    }

    searchResultsChart.update('none');
}

// Update the search quality status widget for the given average number of
// search results per query and raise a one-off alert on degradation.
//
// Levels (by average result count):
//   < 2   critical - red, shows the rate-limit warning banner
//   < 5   warning  - orange
//   < 10  caution  - blue
//   else  good     - green, hides the rate-limit warning banner
//
// The banner is only hidden again at the "good" level, and the shared
// `searchQualityAlert` flag is only cleared at "good", so a popup alert is
// shown once per degradation episode rather than on every poll.
// Does nothing if any of the three status elements is missing.
function updateSearchQualityAlert(avgSearchResults) {
    const statusIcon = document.getElementById('search-status-icon');
    const statusText = document.getElementById('search-status-text');
    const statusDetails = document.getElementById('search-status-details');

    if (!statusIcon || !statusText || !statusDetails) return;

    // The banner is treated as optional, like the elements above: a page
    // without it must still get its status text updated instead of throwing.
    const rateLimitWarning = document.getElementById('rate-limit-warning');
    const setRateLimitWarning = (display) => {
        if (rateLimitWarning) {
            rateLimitWarning.style.display = display;
        }
    };

    const formattedAverage = avgSearchResults.toFixed(1);

    let alertLevel;
    let alertMessage;
    let alertDetails;
    let alertIcon;
    let alertColor;

    if (avgSearchResults < 2) {
        alertLevel = 'critical';
        alertMessage = 'CRITICAL: Very few search results';
        alertDetails = `Only ${formattedAverage} results per query. Accuracy likely severely degraded.`;
        alertIcon = 'fas fa-exclamation-triangle';
        alertColor = '#f44336';
        setRateLimitWarning('block');
    } else if (avgSearchResults < 5) {
        alertLevel = 'warning';
        alertMessage = 'WARNING: Low search results';
        alertDetails = `${formattedAverage} results per query. Consider restarting SearXNG.`;
        alertIcon = 'fas fa-exclamation-circle';
        alertColor = '#ff9800';
    } else if (avgSearchResults < 10) {
        alertLevel = 'caution';
        alertMessage = 'CAUTION: Moderate search results';
        alertDetails = `${formattedAverage} results per query. Performance may be affected.`;
        alertIcon = 'fas fa-info-circle';
        alertColor = '#2196f3';
    } else {
        alertLevel = 'good';
        alertMessage = 'GOOD: Healthy search results';
        alertDetails = `${formattedAverage} results per query. Search engines working well.`;
        alertIcon = 'fas fa-check-circle';
        alertColor = '#4caf50';
        setRateLimitWarning('none');
    }

    // Update UI (alertIcon is one of the fixed class strings above).
    statusIcon.innerHTML = `<i class="${alertIcon}"></i>`;
    statusIcon.style.color = alertColor;
    statusText.textContent = alertMessage;
    statusText.style.color = alertColor;
    statusDetails.textContent = alertDetails;

    // Trigger a popup only on the first degraded reading of an episode.
    if (alertLevel === 'critical' && !searchQualityAlert) {
        searchQualityAlert = true;
        showAlert('Search engine performance critically degraded! Consider restarting SearXNG.', 'error');
    } else if (alertLevel === 'warning' && !searchQualityAlert) {
        searchQualityAlert = true;
        showAlert('Search engine performance is declining. Monitor closely.', 'warning');
    } else if (alertLevel === 'good') {
        searchQualityAlert = false; // Reset alert flag when performance improves
    }
}

// Rate limiting status monitoring (simplified).
// Queries the search-quality endpoint, picks the first engine whose
// `engine_type` contains "searxng" and, when its recent average result count
// is below 2, logs a warning and appends a "Very low results" note to the
// search status details (at most once). Errors are logged and ignored.
async function updateRateLimitingStatus() {
    try {
        const response = await fetch('/benchmark/api/search-quality');
        const payload = await response.json();

        const engineStats = payload.search_quality;
        if (!payload.success || !engineStats || !(engineStats.length > 0)) return;

        // SearXNG is singled out as the engine most critical for benchmarks.
        const searxng = engineStats.find(entry =>
            entry.engine_type.toLowerCase().includes('searxng')
        );
        if (!searxng || !(searxng.recent_avg_results < 2)) return;

        console.warn('Low search results detected:', searxng);

        const details = document.getElementById('search-status-details');
        // The substring check keeps the note from being appended repeatedly.
        if (details && !details.textContent.includes('Very low results')) {
            details.textContent += ` Very low results: ${searxng.recent_avg_results.toFixed(1)} avg.`;
        }
    } catch (error) {
        console.error('Error fetching rate limiting status:', error);
    }
}

// Show or hide the search-engine warning banner for the selected search tool.
// `searchTool` is matched case-insensitively against the engines listed
// below; any other value (including null/undefined) hides the banner.
function checkSearchEngineWarnings(searchTool) {
    const warningContainer = document.getElementById('search-engine-warning');
    const warningText = document.getElementById('search-warning-text');

    // A Map (not a plain object) so that names such as "constructor" can
    // never match an inherited property.
    const warningsByEngine = new Map([
        ['searxng', 'SearXNG is a shared resource. Please use reasonable example counts to avoid affecting other users. Consider shorter benchmarks for testing.'],
        ['arxiv', 'ArXiv is a shared resource containing only academic papers - benchmarking with SimpleQA is useless as it will find zero relevant results for general knowledge questions. Should not be used for this test. Use Tavily instead.'],
        ['pubmed', 'PubMed is a shared resource containing only medical literature - benchmarking with SimpleQA is absolutely useless as general knowledge questions will find zero relevant results. Should not be used for this test. Use Tavily instead.'],
        ['semanticscholar', 'Semantic Scholar is a shared resource specialized for academic research - not suitable for general SimpleQA questions and should not be used for this test. Use Tavily instead.'],
        ['wikipedia', 'Wikipedia is a shared resource with limited coverage - benchmarking with it is useless for comprehensive testing and should not be used for this test. Use Tavily instead.']
    ]);

    const message = warningsByEngine.get(searchTool?.toLowerCase());

    if (message === undefined) {
        warningContainer.style.display = 'none';
        return;
    }

    warningText.textContent = message;
    warningContainer.style.display = 'block';
}

// ==============================================
// Evaluation Settings Functionality
// (Reusing research page model functionality)
// ==============================================

// Cached references to the evaluation-settings form controls.
// All start as null and are assigned by initializeEvaluationSettings().
let evaluationProviderSelect = null,    // #evaluation_provider
    evaluationModelInput = null,        // #evaluation_model
    evaluationEndpointInput = null,     // #evaluation_endpoint_url
    evaluationTemperatureInput = null;  // #evaluation_temperature

// Entry point for the evaluation-settings panel.
// Looks up the form controls, fills the provider list, wires the model
// dropdown and change handlers, requests the model list from the API and,
// 100 ms later, applies the stored settings to the controls.
function initializeEvaluationSettings() {
    console.log('Initializing evaluation settings...');

    // Model lists keyed by provider; an existing cache is kept as is.
    if (!window.evaluationModels) {
        window.evaluationModels = {};
    }

    const byId = (elementId) => document.getElementById(elementId);
    evaluationProviderSelect = byId('evaluation_provider');
    evaluationModelInput = byId('evaluation_model');
    evaluationEndpointInput = byId('evaluation_endpoint_url');
    evaluationTemperatureInput = byId('evaluation_temperature');

    console.log('DOM elements found:', {
        provider: Boolean(evaluationProviderSelect),
        model: Boolean(evaluationModelInput),
        endpoint: Boolean(evaluationEndpointInput),
        temperature: Boolean(evaluationTemperatureInput)
    });

    populateEvaluationProviders();
    setupEvaluationModelDropdown();
    setupEvaluationEventHandlers();

    // Fills window.evaluationModels once the response arrives.
    loadEvaluationModelsFromAPI();

    // Stored settings are applied after a short delay rather than
    // immediately, giving the setup above time to finish.
    setTimeout(() => {
        loadEvaluationSettings();
    }, 100);
}

// Rebuild the evaluation provider <select> with the fixed provider list and
// pre-select the value from its `data-initial-value` attribute (falling back
// to 'openai_endpoint'). The endpoint URL field's wrapper is shown only when
// that initial provider is 'openai_endpoint'.
// No-op when the provider select was not found.
function populateEvaluationProviders() {
    if (!evaluationProviderSelect) return;

    const select = evaluationProviderSelect;

    // Drop whatever options were rendered before.
    select.innerHTML = '';

    // [value, label] pairs, in display order (same list as the research page).
    const providerChoices = [
        ['ollama', 'Ollama (Local)'],
        ['openai', 'OpenAI (Cloud)'],
        ['anthropic', 'Anthropic (Cloud)'],
        ['openai_endpoint', 'Custom OpenAI Endpoint'],
        ['vllm', 'vLLM (Local)'],
        ['lmstudio', 'LM Studio (Local)'],
        ['llamacpp', 'Llama.cpp (Local)']
    ];

    for (const [value, label] of providerChoices) {
        const option = document.createElement('option');
        option.value = value;
        option.textContent = label;
        select.appendChild(option);
    }

    const initialProvider = select.getAttribute('data-initial-value') || 'openai_endpoint';
    console.log('Setting initial evaluation provider to:', initialProvider);
    select.value = initialProvider;

    // The endpoint URL only applies to the custom OpenAI-compatible provider.
    const endpointWrapper = evaluationEndpointInput && evaluationEndpointInput.parentNode;
    if (endpointWrapper) {
        endpointWrapper.style.display = initialProvider === 'openai_endpoint' ? 'block' : 'none';
    }

    console.log('Populated evaluation providers with initial value:', initialProvider);
}

// Wire up the evaluation model dropdown and its refresh button.
//
// The dropdown is created through window.setupCustomDropdown (when that
// component is loaded) with a provider-aware option source and a selection
// callback that mirrors the value into #evaluation_model_hidden and saves it
// as 'benchmark.evaluation.model'. Free-text model names are allowed.
//
// The refresh button re-fetches the model list with force_refresh=true,
// replaces window.evaluationModels and refreshes the dropdown.
// No-op when the model input or the dropdown list element is missing.
function setupEvaluationModelDropdown() {
    if (!evaluationModelInput) return;

    const dropdownList = document.getElementById('evaluation-model-dropdown-list');
    if (!dropdownList) return;

    // Setup custom dropdown using the existing component
    if (window.setupCustomDropdown) {
        window.evaluationDropdownInstance = window.setupCustomDropdown(
            evaluationModelInput,
            dropdownList,
            function() {
                // Get models dynamically based on current provider
                const provider = evaluationProviderSelect ? evaluationProviderSelect.value : 'openai_endpoint';

                // Use loaded models if available (an empty list counts as loaded)
                if (window.evaluationModels && window.evaluationModels[provider]) {
                    console.log(`Returning ${window.evaluationModels[provider].length} loaded models for ${provider}`);
                    return window.evaluationModels[provider];
                }

                // Otherwise return defaults
                return getEvaluationModelOptions();
            },
            function(value, item) {
                // On selection callback
                const hiddenInput = document.getElementById('evaluation_model_hidden');
                if (hiddenInput) {
                    hiddenInput.value = value;
                }
                saveEvaluationSetting('benchmark.evaluation.model', value);
            },
            true, // Allow custom values
            'No models available'
        );
    }

    // Setup refresh button
    const refreshBtn = document.querySelector('[data-target="evaluation-model-dropdown"] .refresh-btn');
    if (!refreshBtn) return;

    // Normalise one API model entry into a dropdown option. The label falls
    // back to the id as well, so an entry carrying only `id` is not shown
    // with an undefined label.
    const toOption = (model) => ({
        value: model.value || model.id,
        label: model.label || model.name || model.value || model.id
    });

    refreshBtn.addEventListener('click', function(e) {
        e.preventDefault();
        console.log('Refresh button clicked, force reloading models...');

        // Show loading state
        const icon = this.querySelector('i');
        if (icon) {
            icon.classList.add('fa-spin');
        }

        // Force reload models from API
        window.modelsLoading = false; // Reset the flag

        fetch('/settings/api/available-models?force_refresh=true')
            .then(response => {
                // Surface HTTP errors through the catch handler below instead
                // of parsing an error page as if it were model data.
                if (!response.ok) {
                    throw new Error(`Model refresh failed with HTTP status ${response.status}`);
                }
                return response.json();
            })
            .then(data => {
                console.log('Force refresh received model data:', data);

                if (data && data.providers) {
                    // Replace the cached model lists wholesale
                    window.evaluationModels = {};

                    Object.entries(data.providers).forEach(([providerKey, models]) => {
                        if (!Array.isArray(models)) return;

                        // API keys look like "<provider>_models"; the cache is
                        // keyed by the lower-case provider name.
                        const providerName = providerKey.replace('_models', '').toLowerCase();

                        window.evaluationModels[providerName] = models.map(toOption);
                        console.log(`Loaded ${models.length} models for ${providerName}`);
                    });

                    // Update dropdown with new data
                    refreshEvaluationModels();
                }
            })
            .catch(error => {
                console.error('Error loading evaluation models:', error);
            })
            .finally(() => {
                // Remove loading state
                if (icon) {
                    icon.classList.remove('fa-spin');
                }
                window.modelsLoading = false;
            });
    });
}

// Return the model options for the currently selected evaluation provider.
//
// Cached models from window.evaluationModels are returned when a non-empty
// list exists for the provider. Otherwise a load from the API is requested
// (unless one is already running) and a small built-in default list is
// returned in the meantime - empty for providers without built-in defaults.
function getEvaluationModelOptions() {
    const provider = evaluationProviderSelect ? evaluationProviderSelect.value : 'openai_endpoint';
    console.log('Getting evaluation model options for provider:', provider);

    // Check if we have loaded models
    const cached = window.evaluationModels && window.evaluationModels[provider];
    if (cached && cached.length > 0) {
        console.log(`Returning ${cached.length} cached models for ${provider}`);
        return cached;
    }

    // Request a load if none is in flight. The loader sets and clears
    // window.modelsLoading itself; raising the flag here would make the
    // loader's own "already loading" guard skip the request and leave the
    // flag stuck on true, so models would never be fetched.
    if (!window.modelsLoading) {
        loadEvaluationModelsFromAPI();
    }

    // Return minimal defaults while loading
    console.log(`No models loaded yet for ${provider}, returning defaults`);
    switch (provider) {
        case 'openai_endpoint':
            return [
                { value: 'anthropic/claude-3.5-sonnet', label: 'Claude 3.5 Sonnet' },
                { value: 'openai/gpt-4o', label: 'GPT-4o' },
                { value: '01-ai/yi-large', label: 'Yi Large' }
            ];
        case 'openai':
            return [
                { value: 'gpt-4o', label: 'GPT-4o' },
                { value: 'gpt-4', label: 'GPT-4' },
                { value: 'gpt-3.5-turbo', label: 'GPT-3.5 Turbo' }
            ];
        case 'anthropic':
            return [
                { value: 'claude-3-5-sonnet-latest', label: 'Claude 3.5 Sonnet' },
                { value: 'claude-3-opus-20240229', label: 'Claude 3 Opus' }
            ];
        default:
            // No built-in defaults for other providers
            return [];
    }
}

// Debounce helper to prevent too many API calls.
// Returns a wrapper that postpones `func` until `wait` milliseconds have
// passed without another call; only the arguments of the latest call are
// used. The wrapper itself returns nothing.
function debounce(func, wait) {
    let pendingTimer;

    return function executedFunction(...args) {
        // Each call cancels the previously scheduled run and starts over.
        clearTimeout(pendingTimer);
        pendingTimer = setTimeout(() => {
            clearTimeout(pendingTimer);
            func(...args);
        }, wait);
    };
}

// Load the available models for every provider into window.evaluationModels
// and refresh the evaluation model dropdown.
//
// Debounced: the request is made 500 ms after the last call. Pass
// `forceRefresh = true` to bypass the "already loading" guard and to ask the
// API for a forced refresh. window.modelsLoading is raised while the request
// is running and always cleared afterwards. Errors are logged and ignored.
const loadEvaluationModelsFromAPI = debounce(function(forceRefresh = false) {
    console.log('Loading evaluation models from API...', forceRefresh ? '(force refresh)' : '');

    // Prevent multiple simultaneous loads
    if (window.modelsLoading && !forceRefresh) {
        console.log('Models already loading, skipping...');
        return;
    }

    window.modelsLoading = true;

    // Use the correct API endpoint with optional force_refresh parameter
    const url = forceRefresh ? '/settings/api/available-models?force_refresh=true' : '/settings/api/available-models';

    // Normalise one API model entry into a dropdown option. The label falls
    // back to the id as well, so an entry carrying only `id` is not shown
    // with an undefined label.
    const toOption = (model) => ({
        value: model.value || model.id,
        label: model.label || model.name || model.value || model.id
    });

    fetch(url)
        .then(response => {
            // Surface HTTP errors through the catch handler below instead of
            // parsing an error page as if it were model data.
            if (!response.ok) {
                throw new Error(`Model request failed with HTTP status ${response.status}`);
            }
            return response.json();
        })
        .then(data => {
            console.log('Received model data:', data);

            if (data && data.providers) {
                // Replace the cached model lists wholesale
                window.evaluationModels = {};

                Object.entries(data.providers).forEach(([providerKey, models]) => {
                    if (!Array.isArray(models)) return;

                    // API keys look like "<provider>_models"; the cache is
                    // keyed by the lower-case provider name.
                    const providerName = providerKey.replace('_models', '').toLowerCase();

                    window.evaluationModels[providerName] = models.map(toOption);
                    console.log(`Loaded ${models.length} models for ${providerName}`);
                });

                // Update dropdown with new data
                refreshEvaluationModels();
            }
        })
        .catch(error => {
            console.error('Error loading evaluation models:', error);
        })
        .finally(() => {
            window.modelsLoading = false;
        });
}, 500); // Wait 500ms before making the API call

// Select the models belonging to `provider` and convert them to dropdown
// options of the form { value, label }.
//
// `models` is a list of model entries; each entry's `provider` is compared
// with `provider` case-insensitively (entries without a provider never match
// a non-empty provider name). A missing or non-array `models` is treated as
// an empty list.
//
// When nothing matches, built-in defaults are returned for 'openai_endpoint',
// 'openai' and 'anthropic'; any other provider yields an empty array.
function filterModelsForProvider(models, provider) {
    const providerUpper = provider.toUpperCase();
    const candidates = Array.isArray(models) ? models : [];

    const matching = candidates.filter(
        model => (model.provider || '').toUpperCase() === providerUpper
    );

    // If no models found for provider, return some defaults
    if (matching.length === 0) {
        switch (providerUpper) {
            case 'OPENAI_ENDPOINT':
                return [
                    { value: '01-ai/yi-large', label: 'Yi Large' },
                    { value: 'anthropic/claude-3.5-sonnet', label: 'Claude 3.5 Sonnet' },
                    { value: 'openai/gpt-4o', label: 'GPT-4o' }
                ];
            case 'OPENAI':
                return [
                    { value: 'gpt-4o', label: 'GPT-4o' },
                    { value: 'gpt-4', label: 'GPT-4' },
                    { value: 'gpt-3.5-turbo', label: 'GPT-3.5 Turbo' }
                ];
            case 'ANTHROPIC':
                return [
                    { value: 'claude-3-5-sonnet-latest', label: 'Claude 3.5 Sonnet' },
                    { value: 'claude-3-opus-20240229', label: 'Claude 3 Opus' }
                ];
            default:
                return [];
        }
    }

    // The label falls back to the id too, mirroring the value fallback, so
    // an entry carrying only `id` does not end up with an undefined label.
    return matching.map(model => ({
        value: model.value || model.id,
        label: model.label || model.name || model.value || model.id
    }));
}

// Push the cached model list of the selected provider into the evaluation
// model dropdown and, when that list is non-empty, open the dropdown by
// simulating a click on the input 100 ms later.
function refreshEvaluationModels() {
    const provider = evaluationProviderSelect ? evaluationProviderSelect.value : 'openai_endpoint';

    const cache = window.evaluationModels;
    const options = (cache && cache[provider]) ? cache[provider] : [];

    console.log(`Refreshing evaluation dropdown with ${options.length} options for provider ${provider}`);

    const input = evaluationModelInput;

    // updateDropdownOptions is only available once the custom dropdown
    // component has been loaded.
    if (window.updateDropdownOptions && input) {
        window.updateDropdownOptions(input, options);
    }

    // Trigger a click so the dropdown shows the updated options.
    if (input && options.length > 0) {
        setTimeout(() => {
            evaluationModelInput.click();
        }, 100);
    }
}


// Attach "change" listeners to the evaluation-settings controls so that
// every edit is saved through saveEvaluationSetting(). Controls that were
// not found are skipped.
//
// Changing the provider additionally toggles the endpoint URL field (shown
// only for 'openai_endpoint') and refreshes the model dropdown before the
// provider itself is saved.
function setupEvaluationEventHandlers() {
    // Save `read(element)` under `settingKey` whenever `element` changes.
    const saveOnChange = (element, settingKey, read) => {
        if (!element) return;
        element.addEventListener('change', () => {
            saveEvaluationSetting(settingKey, read(element));
        });
    };

    const rawValue = (element) => element.value;

    // Provider
    const providerSelect = evaluationProviderSelect;
    if (providerSelect) {
        providerSelect.addEventListener('change', () => {
            const provider = providerSelect.value;
            console.log('Evaluation provider changed to:', provider);

            // The endpoint URL only applies to the custom OpenAI endpoint.
            const endpointWrapper = evaluationEndpointInput && evaluationEndpointInput.parentNode;
            if (endpointWrapper) {
                endpointWrapper.style.display = provider === 'openai_endpoint' ? 'block' : 'none';
            }

            refreshEvaluationModels();
            saveEvaluationSetting('benchmark.evaluation.provider', provider);
        });
    }

    // Model and endpoint URL are stored as entered.
    saveOnChange(evaluationModelInput, 'benchmark.evaluation.model', rawValue);
    saveOnChange(evaluationEndpointInput, 'benchmark.evaluation.endpoint_url', rawValue);

    // Temperature is stored as a number.
    saveOnChange(evaluationTemperatureInput, 'benchmark.evaluation.temperature',
        (element) => parseFloat(element.value));
}

// Populate the evaluation controls from the already-fetched settings payloads
// (evalProviderData, evalModelData, evalEndpointData, evalTempData).
// If any payload is still missing, only logs and returns without touching the
// controls. Each control is filled in only when both its element and the
// payload's `settings` object are present.
function loadEvaluationSettings() {
    console.log('Loading evaluation settings...');
    console.log('Current DOM elements state:', {
        provider: Boolean(evaluationProviderSelect),
        model: Boolean(evaluationModelInput),
        endpoint: Boolean(evaluationEndpointInput),
        temperature: Boolean(evaluationTemperatureInput)
    });

    // All four payloads must be available before anything is applied.
    const payloadsReady = evalProviderData && evalModelData && evalEndpointData && evalTempData;
    if (!payloadsReady) {
        console.log('Evaluation settings not loaded yet, skipping...');
        return;
    }

    // Provider (falls back to 'openai_endpoint')
    if (evaluationProviderSelect && evalProviderData.settings) {
        const savedProvider = evalProviderData.settings.value || 'openai_endpoint';
        console.log('Setting evaluation provider to:', savedProvider);
        evaluationProviderSelect.value = savedProvider;

        // The endpoint field's parent is visible only for 'openai_endpoint'.
        const endpointWrapper = evaluationEndpointInput && evaluationEndpointInput.parentNode;
        if (endpointWrapper) {
            endpointWrapper.style.display = savedProvider === 'openai_endpoint' ? 'block' : 'none';
        }
    }

    // Model (falls back to 'anthropic/claude-3.7-sonnet')
    if (evaluationModelInput && evalModelData.settings) {
        const savedModel = evalModelData.settings.value || 'anthropic/claude-3.7-sonnet';
        console.log('Setting evaluation model to:', savedModel);

        // Writes the model into the visible input (label when known, raw value
        // otherwise), the hidden input, and the dropdown instance.
        const applySavedModel = () => {
            const activeProvider = evaluationProviderSelect
                ? evaluationProviderSelect.value
                : 'openai_endpoint';
            const providerModels =
                (window.evaluationModels && window.evaluationModels[activeProvider]) || [];

            // Look up the entry for the saved value so its label can be displayed.
            const knownModel = providerModels.find(entry => entry.value === savedModel);

            if (!knownModel) {
                evaluationModelInput.value = savedModel;
                console.log('No matching model found, setting raw value:', savedModel);
            } else {
                evaluationModelInput.value = knownModel.label;
                console.log('Found matching model, setting label:', knownModel.label);
            }

            // The hidden input always carries the raw model value.
            const hiddenField = document.getElementById('evaluation_model_hidden');
            if (hiddenField) {
                hiddenField.value = savedModel;
            }

            // Keep the dropdown instance in sync when it exposes setValue.
            const dropdown = window.evaluationDropdownInstance;
            if (dropdown && dropdown.setValue) {
                dropdown.setValue(savedModel, false);
            }
        };

        // Apply right away when the model lists exist; otherwise retry once
        // after one second.
        const modelsAvailable =
            window.evaluationModels && Object.keys(window.evaluationModels).length > 0;
        if (modelsAvailable) {
            applySavedModel();
        } else {
            setTimeout(applySavedModel, 1000);
        }
    }

    // Endpoint URL (falls back to the OpenRouter URL)
    if (evaluationEndpointInput && evalEndpointData.settings) {
        const savedEndpoint = evalEndpointData.settings.value || 'https://openrouter.ai/api/v1';
        console.log('Setting evaluation endpoint to:', savedEndpoint);
        evaluationEndpointInput.value = savedEndpoint;
    }

    // Temperature (falls back to 0)
    if (evaluationTemperatureInput && evalTempData.settings) {
        const savedTemperature = evalTempData.settings.value || 0;
        console.log('Setting evaluation temperature to:', savedTemperature);
        evaluationTemperatureInput.value = savedTemperature;
    }
}

// Reset every evaluation control that exists on the page to its hard-coded
// default: provider 'openai_endpoint', model 'anthropic/claude-3.7-sonnet',
// the OpenRouter endpoint URL, and temperature 0.
function setEvaluationDefaults() {
    console.log('Setting evaluation defaults');

    if (evaluationProviderSelect) {
        evaluationProviderSelect.value = 'openai_endpoint';

        // The default provider uses the endpoint field, so make its parent visible.
        const endpointWrapper = evaluationEndpointInput && evaluationEndpointInput.parentNode;
        if (endpointWrapper) {
            endpointWrapper.style.display = 'block';
        }
    }

    if (evaluationModelInput) {
        evaluationModelInput.value = 'anthropic/claude-3.7-sonnet';
    }

    if (evaluationEndpointInput) {
        evaluationEndpointInput.value = 'https://openrouter.ai/api/v1';
    }

    if (evaluationTemperatureInput) {
        evaluationTemperatureInput.value = 0;
    }
}

// Persist one evaluation setting with a PUT to /settings/api/<key>.
//
// key   - settings key; interpolated into the request path as-is
// value - new value; sent as { value } in the JSON request body
//
// Returns the promise chain for the request. Every failure is logged and
// swallowed here, so the promise always fulfils and callers may ignore it.
function saveEvaluationSetting(key, value) {
    console.log('Saving evaluation setting:', key, '=', value);

    // CSRF token comes from the <meta name="csrf-token"> tag; empty string if absent
    const csrfToken = document.querySelector('meta[name="csrf-token"]')?.getAttribute('content') || '';

    return fetch(`/settings/api/${key}`, {
        method: 'PUT',
        headers: {
            'Content-Type': 'application/json',
            'X-CSRFToken': csrfToken
        },
        body: JSON.stringify({ value: value })
    })
    .then(response =>
        // Error responses are not guaranteed to be JSON. Fall back to null so
        // the HTTP status gets reported instead of a JSON parse error.
        response.json()
            .catch(() => null)
            .then(data => ({ response, data }))
    )
    .then(({ response, data }) => {
        if (data && data.success) {
            console.log('Successfully saved evaluation setting:', key);
        } else {
            // Prefer the server-provided message; otherwise describe the HTTP status.
            const reason = (data && data.error) || `HTTP ${response.status} ${response.statusText}`;
            console.error('Failed to save evaluation setting:', reason);
        }
    })
    .catch(error => {
        // Network failures and anything thrown above end up here.
        console.error('Error saving evaluation setting:', error);
    });
}


</script>
{% endblock %}

{% block page_scripts %}
<!-- Load required services for progress tracking.
     These are classic scripts with neither async nor defer, so they are
     fetched and executed in the order listed here. -->
<script src="{{ url_for('research.serve_static', path='js/services/audio.js') }}"></script>
<script src="{{ url_for('research.serve_static', path='js/services/ui.js') }}"></script>
<script src="{{ url_for('research.serve_static', path='js/services/formatting.js') }}"></script>
<script src="{{ url_for('research.serve_static', path='js/services/api.js') }}"></script>
<script src="{{ url_for('research.serve_static', path='js/services/socket.js') }}"></script>
<!-- Load custom dropdown component for evaluation model selection -->
<script src="{{ url_for('research.serve_static', path='js/components/custom_dropdown.js') }}"></script>
{% endblock %}
